This document was last updated at 2019-08-10 09:25:56.

This document is dedicated to preprocessing the data from Experiment 2.

Import and view the data:

dst <- read.csv('../../data/dst.csv')
demo <- read.csv('../../data/demo.csv')
rapidFire <- read.csv('../../data/rapidFire.csv')
pracCued <- read.csv('../../data/pracCued.csv')

n <- nrow(demo)

dst

The initial sample size is 8.

## coding some helper vars right off the bat
dst <- dst %>% 
  mutate(ssd = ifelse(selectedRiskyDeck, 0, 1),
         differenceE = ifelse(difference == 'Moderate', -0.5, 0.5),
         difficultyE = ifelse(difficulty == 'Easier than Reference', -0.5, 0.5))

Run time

For piloting purposes, I’m curious as to how long the experiment is running.
So far in piloting, I’ve implemented two different versions: one where there were 8 total cycles in DST and one with 10.

dstTrim <- dst %>% 
  group_by(subject) %>% 
  summarize(dstRunTimeMins = max(phaseRunTimeMins),
            cycleThreshold = max(choiceTrial))

rapidFireTrim <- rapidFire %>% 
  group_by(subject) %>% 
  summarize(rapidFireRunTimeMins = max(phaseRunTimeMins))

pracCuedTrim <- pracCued %>% 
  group_by(subject) %>% 
  summarize(pracCuedRunTimeMins = max(runTimeMins))

demoTrim <- demo %>% 
  select(subject, totalTime_mins)

d <- dstTrim %>% 
  inner_join(rapidFireTrim) %>% 
  inner_join(pracCuedTrim) %>% 
  inner_join(demoTrim)
## Joining, by = "subject"
## Joining, by = "subject"
## Joining, by = "subject"
d
d %>% 
  ggplot(aes(x = totalTime_mins)) +
  geom_histogram(color = 'black', fill = 'light grey') +
  labs(
    x = 'Total Run Time in Experiment (mins)',
    caption = 'Extreme long times usually suggest participant left and came back at some point'
  ) +
  theme_bw()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Subject Exclusion

Subjects will be excluded for:

badSubjectsList <- demo[demo$vision == 'impaired',]$subject
badSubjects <- data.frame(subject = badSubjectsList, reason = rep('Vision impaired', length(badSubjectsList)))

badSubjectsList <- dst %>% 
  group_by(subject) %>% 
  summarize(error = mean(error))

badSubjectsList %>% 
  ggplot(aes(x = error)) + 
  geom_histogram(color = 'black', fill = 'light grey', bins = ifelse(n < 10, 10, 30)) +
  theme_bw() +
  xlab('Mean Error Rates')

badSubjectsList <- badSubjectsList[badSubjectsList$error > .15,]$subject
badSubjects <- rbind(badSubjects, data.frame(subject = badSubjectsList, reason = rep('Error rate higher than 15%', length(badSubjectsList))))
badSubjects

Even though we’re only analyzing data with less than 15% error rate, the criterion for accepting HITs was error rates over 35% (even though we told workers it was only 25%) or mean cued response times under 400 ms.

good <- dst %>% 
  filter(cuedRt < 10000) %>% 
  group_by(subject) %>%
  summarize(error = mean(error), rt = mean(cuedRt)) %>% 
  filter(error < .35)

bad <- dst %>% 
  filter(cuedRt < 10000) %>% 
  group_by(subject) %>%
  summarize(error = mean(error), rt = mean(cuedRt)) %>% 
  filter(error > .35)

Workers above the 35% error rate threshold:

good

Workers below the 35% error rate threshold:

bad
source('../identitiesAndRejections/computeRejectList.r')
## Joining, by = "subject"
## Joining, by = "subject"

Plot the clustering of humans and bots (although, I don’t expect there to be many bots this time because I implemented something in the experiment to prevent them).

rejectList <- read.csv('../identitiesAndRejections/rejectList.csv')

rejectList <- ifelse(nrow(rejectList) > 0, rejectList$subject, -99)

dst %>%
      mutate(isBot = ifelse(subject %in% rejectList, 'Bot', 'Human')) %>% 
  filter(cuedRt < 10000) %>% 
  group_by(subject) %>% 
  summarize(error = mean(error), rt = mean(cuedRt), isBot = unique(isBot)) %>% 
  ggplot(aes(x = error, y = rt)) +
  geom_point(aes(color = isBot)) +
  scale_color_manual(name = 'Turing Test', values = c(Bot = 'red', Human = 'dark green')) +
  xlab('Mean Error Rate') +
  ylab('Mean Cued Response Time (ms)') + 
  labs(caption = 'Red dashed lines represent the HIT rejection criteria') +
  theme_bw() +
  theme(legend.position = 'bottom') +
  geom_vline(xintercept = 0.35, linetype = 'dashed', color = 'red') +
  geom_hline(yintercept = 400, linetype = 'dashed', color = 'red')

Drop bad data

print(paste('Number of rows before dropping bad subjects:', nrow(dst)))
## [1] "Number of rows before dropping bad subjects: 5440"
dst <- dst[!(dst$subject %in% badSubjects$subject),]
print(paste('Number of rows after dropping bad subjects:', nrow(dst)))
## [1] "Number of rows after dropping bad subjects: 4760"
demo <- demo[!(demo$subject %in% badSubjects$subject),]
rapidFire <- rapidFire[!(rapidFire$subject %in% badSubjects$subject),]

Zoom in on error rates for everyone else:

dst %>% 
  group_by(subject) %>% 
  summarize(error = mean(error)) %>% 
  ggplot(aes(x = error)) +
  geom_histogram(color = 'black', fill = 'light grey') + 
  theme_bw() +
  xlab('Error Rate')
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Error and RT Trimming

Response Time

First, dropping all trials with RT > 10 s

The choice-trimming procedures used for the DST phase will also be applied to the rapid choice phase. Summaries below reflect only trimming to DST phase.

initialRows <- nrow(dst)

print(paste('Number of rows before removing trials with RTs longer than 10 s:', initialRows))
## [1] "Number of rows before removing trials with RTs longer than 10 s: 4760"
dst <- dst %>% 
  filter(cuedRt < 10000, choiceRt < 10000)

print(paste('Number of rows after removing trials with RTs longer than 10 s:', nrow(dst)))
## [1] "Number of rows after removing trials with RTs longer than 10 s: 4699"
rapidFire <- rapidFire %>% 
  filter(choiceRt < 10000)

badTrials <- data.frame('Data Dropped' = round(1 - (nrow(dst) / initialRows), 2), Reason = 'Trials longer than 10 s')

badTrials

Second, trials will be dropped based on subject-wise means of rts, separately for both cued and choice

## choice first
initialRows <- nrow(dst)
print(paste('Number of rows before removing trials where choices exceeded participant-wise choice RT cutoffs:', initialRows))
## [1] "Number of rows before removing trials where choices exceeded participant-wise choice RT cutoffs: 4699"
dst <- dst %>% 
  group_by(subject) %>% 
  summarize(meanChoiceRt = mean(choiceRt), sdChoiceRt = sd(choiceRt)) %>% 
  inner_join(dst) %>% 
  mutate(badChoice = ifelse(choiceRt <= meanChoiceRt - 2 * sdChoiceRt | choiceRt > meanChoiceRt + 2 * sdChoiceRt, 1, 0)) %>% 
  filter(badChoice == 0) %>% 
  select(-badChoice, meanChoiceRt, sdChoiceRt)
## Joining, by = "subject"
rapidFire <- rapidFire %>% 
  group_by(subject) %>% 
  summarize(meanChoiceRt = mean(choiceRt), sdChoiceRt = sd(choiceRt)) %>% 
  inner_join(rapidFire) %>% 
  mutate(badChoice = ifelse(choiceRt <= meanChoiceRt - 2 * sdChoiceRt | choiceRt > meanChoiceRt + 2 * sdChoiceRt, 1, 0)) %>% 
  filter(badChoice == 0) %>% 
  select(-badChoice, meanChoiceRt, sdChoiceRt)
## Joining, by = "subject"
print(paste('Number of rows before removing trials where choices exceeded participant-wise choice RT cutoffs:', nrow(dst)))
## [1] "Number of rows before removing trials where choices exceeded participant-wise choice RT cutoffs: 4449"
badTrials <- rbind(badTrials, data.frame('Data Dropped' = round(1 - (nrow(dst) / initialRows), 2), Reason = 'Choice trials with RTs more extreme than two SDs beyond participant-wise means'))
badTrials
## now for cued responses
initialRows <- nrow(dst)
print(paste('Number of rows before removing trials where cued responses exceeded participant-wise cued RT cutoffs:', initialRows))
## [1] "Number of rows before removing trials where cued responses exceeded participant-wise cued RT cutoffs: 4449"
dst <- dst %>% 
  group_by(subject) %>% 
  summarize(meancuedRt = mean(cuedRt), sdcuedRt = sd(cuedRt)) %>% 
  inner_join(dst) %>% 
  mutate(badcued = ifelse(cuedRt <= meancuedRt - 2 * sdcuedRt | cuedRt > meancuedRt + 2 * sdcuedRt, 1, 0)) %>% 
  filter(badcued == 0) %>% 
  select(-badcued)
## Joining, by = "subject"
print(paste('Number of rows before removing trials where cued responses exceeded participant-wise cued RT cutoffs:', nrow(dst)))
## [1] "Number of rows before removing trials where cued responses exceeded participant-wise cued RT cutoffs: 4266"
badTrials <- rbind(badTrials, data.frame('Data Dropped' = round(1 - (nrow(dst) / initialRows), 2), Reason = 'Cued trials with RTs more extreme than two SDs beyond participant-wise means'))
badTrials

Saving out a dataset for error analysis

write.csv(dst, '../../data/dstCleanErrors.csv', row.names = FALSE)

Trimming out error trials and trials following error trials
I didn’t actually say I’dst trim trials following error trials in the document, so I might want to think about that some

initialRows <- nrow(dst)
print(paste('Number of rows before removing error trials and trials following error trials :', initialRows))
## [1] "Number of rows before removing error trials and trials following error trials : 4266"
dst <- dst %>% 
  mutate(errorTrim = ifelse(error | shift(error), 1, 0)) %>% 
  filter(errorTrim == 0)

print(paste('Number of rows before removing error trials and trials following error trials :', nrow(dst)))
## [1] "Number of rows before removing error trials and trials following error trials : 3890"
badTrials <- rbind(badTrials, data.frame('Data Dropped' = round(1 - (nrow(dst) / initialRows), 2), Reason = 'Trimming error trials and trials following error trials'))
badTrials

That should be good.

write.csv(dst, '../../data/dstClean.csv', row.names = FALSE)
write.csv(demo, '../../data/demoClean.csv', row.names = FALSE)
write.csv(rapidFire, '../../data/rapidFireClean.csv', row.names = FALSE)

## save out condensed data with choices only

dst <- dst %>% 
  group_by(subject, choiceTrial) %>% 
  summarize(choiceRt = mean(choiceRt), 
            chosenDeckId = unique(chosenDeckId), 
            difference = unique(difference), 
            difficulty = unique(difficulty), 
            selectedDeckLocation = unique(selectedDeckLocation),
            leftDeckId = unique(leftDeckId),
            rightDeckId = unique(rightDeckId),
            riskyDeckSwitchTop = unique(riskyDeckSwitchTop),
            riskyDeckSwitchBottom = unique(riskyDeckSwitchBottom),
            safeDeckSwitch = unique(safeDeckSwitch),
            outcomeSwitch = unique(outcomeSwitch),
            condition = unique(condition),
            selectedRiskyDeck = unique(selectedRiskyDeck)) %>% 
  ungroup()

write.csv(dst, '../../data/dstCleanChoice.csv', row.names = FALSE)

n <- dst %>% 
  group_by(subject) %>% 
  summarize(n()) %>% 
  nrow()

Final sample size is 7.

 

Analysis Homepage

A work by Dave Braun

dab414@lehigh.edu